import pandas as pd


# Project requirement 3: clean the merged files produced by requirement 2 by
# removing anomalous rows and adding a "city" column derived from the
# longitude/latitude of each record.

# Default file stems. Entry i (1-based) is read from folder "<path><i>".
# NOTE(review): "PM25" is listed twice (folders 5 and 6) -- confirm that the
# sixth entry is not meant to be a different data set.
_DEFAULT_FILES = (
    "DMYS",
    "DMYS-HF",
    "H500",
    "PLAM-HF",
    "PM25",
    "PM25",
)

# Common prefix of the source folders; the 1-based folder index is appended.
_DEFAULT_PATH = "C:/Users/24350/IdeaProjects/Meteorological_BigData/initialdata"

# Sentinel marking an anomalous measurement (matches both 9999 and 9999.0).
_ANOMALY_VALUE = 9999

# (city, lon_min, lon_max, lat_min, lat_max) with inclusive bounds.
# Order matters: some boxes overlap (e.g. 北京 and 天津), and a later entry
# overwrites an earlier one for rows that fall inside both.
_CITY_BOXES = (
    ("北京", 115.19, 117.19, 38.88, 40.88),
    ("上海", 120.49, 122.49, 30.41, 32.41),
    ("天津", 116.31, 118.31, 38.72, 40.72),
    ("济南", 115.75, 117.75, 35.56, 37.56),
    ("青岛", 119.40, 121.40, 35.31, 37.31),
    ("厦门", 117.03, 119.03, 23.48, 25.48),
    ("郑州", 112.64, 114.64, 33.72, 35.72),
    ("乌鲁木齐", 87.31, 89.31, 42.36, 44.36),
    ("成都", 103.10, 105.10, 29.66, 31.66),
    ("呼和浩特", 110.62, 112.62, 39.81, 41.81),
    ("海口", 109.33, 111.33, 19.03, 21.03),
)


def _assign_city(df):
    """Return a copy of *df* with a "city" column set from the bounding boxes.

    Rows outside every box get a missing value in "city".
    """
    lon = df["经度"]
    lat = df["纬度"]
    # Start from an all-missing column so "city" exists even if nothing matches.
    city = pd.Series(None, index=df.index, dtype=object)
    for name, lon_min, lon_max, lat_min, lat_max in _CITY_BOXES:
        # Series.between is inclusive on both ends, i.e. >= min and <= max.
        inside = lon.between(lon_min, lon_max) & lat.between(lat_min, lat_max)
        city.loc[inside] = name
    return df.assign(city=city)


def AddCity(filelist=None, path=None):
    """Clean each merged data file and tag its rows with a city.

    For the i-th name in *filelist* (1-based) the tab-separated file
    ``<path><i>/<name>.txt`` is read, every row that contains the anomaly
    value 9999 in any column (or any missing value) is dropped, a "city"
    column is derived from the "经度"/"纬度" columns, rows that match no city
    are dropped, and the result is written to ``<path><i>/<name>-AddCity.txt``
    with floats formatted to two decimals.

    The output file is overwritten on each run, so re-running the script does
    not append a second header and duplicate rows to an existing result.

    Args:
        filelist: Iterable of file stems, one per numbered folder. Defaults to
            the project's built-in list.
        path: Common prefix of the numbered folders. Defaults to the project's
            built-in location.

    Raises:
        FileNotFoundError: If an input file does not exist.
        KeyError: If an input file lacks the "经度" or "纬度" column.
    """
    if filelist is None:
        filelist = _DEFAULT_FILES
    if path is None:
        path = _DEFAULT_PATH

    for index, name in enumerate(filelist, start=1):
        folder = f"{path}{index}"
        df = pd.read_table(f"{folder}/{name}.txt", sep="\t", header=0, low_memory=False)
        # Blank out anomalous cells, then drop every row with a missing cell.
        df = df[~df.isin([_ANOMALY_VALUE])].dropna(axis=0)
        df = _assign_city(df)
        df = df[df["city"].notnull()]  # keep only rows that fall inside a city box
        df.to_csv(
            f"{folder}/{name}-AddCity.txt",
            float_format="%.2f",
            mode="w",  # overwrite: appending would duplicate data on re-runs
            sep="\t",
            index=False,
        )


# Script entry point: run the cleaning pass only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    AddCity()